library(GenomicAlignments)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2     ✔ purrr   0.3.3
## ✔ tibble  3.1.6     ✔ dplyr   1.0.3
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ purrr::compact()    masks XVector::compact()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks GenomicAlignments::first(), S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::last()       masks GenomicAlignments::last()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ purrr::simplify()   masks DelayedArray::simplify()
## ✖ dplyr::slice()      masks XVector::slice(), IRanges::slice()
library(cqn)
## Loading required package: mclust
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
## Loading required package: nor1mix
## Loading required package: preprocessCore
## Loading required package: splines
## Loading required package: quantreg
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(RColorBrewer)
library(edgeR)
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(ggplot2)
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
library(ggridges)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
colsBig <- clusterExperiment:::massivePalette
plotGCHex <- function(gr, counts){
  counts2 <- counts
  df <- as_tibble(cbind(counts2,gc=mcols(gr)$gc))
  df <- gather(df, sample, value, -gc)
  ggplot(data=df, aes(x=gc, y=log(value+1)) ) + 
    ylab("log(count + 1)") + xlab("GC-content") + 
    geom_hex(bins = 50) + theme_bw() #+ facet_wrap(~sample, nrow=2)
}
pal <- RColorBrewer::brewer.pal(n=8, "Dark2")
source("../../methods/gcqn_validated.R")

### this dataset combines samples from a number of different sources, therefore the GC effects are wildly different between samples.
data <- readRDS("../../data/gcapc/fcResults.rds")
gr <- as(data$annotation, "GRanges")
seqlevelsStyle(gr) <- "NCBI"
counts <- data$counts

# get GC content
ff <- FaFile("~/data/genomes/human/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz")
peakSeqs <- getSeq(x=ff, gr)
gcContentPeaks <- letterFrequency(peakSeqs, "GC",as.prob=TRUE)[,1]
gcGroups <- Hmisc::cut2(gcContentPeaks, g=20)
mcols(gr)$gc <- gcContentPeaks
widthGroups <- Hmisc::cut2(width(gr), g=20)


dfGCWidth <- data.frame(gc=gcContentPeaks,
                        width=width(gr))
ggplot(dfGCWidth, aes(x=gc, y=log(width))) +
  geom_hex() +
  geom_smooth(se=FALSE, color="red", aes(group=1), lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Lowess fits

GC content

lowListGC <- list()
for(kk in 1:ncol(counts)){
  set.seed(kk)
  lowListGC[[kk]] <- lowess(x=gcContentPeaks, y=log1p(counts[,kk]), f=1/10)
}

Visualization

dfList <- list()
for(ss in 1:length(lowListGC)){
  oox <- order(lowListGC[[ss]]$x)
  dfList[[ss]] <- data.frame(x=lowListGC[[ss]]$x[oox], y=lowListGC[[ss]]$y[oox], sample=ss)
}
dfAll <- do.call(rbind, dfList)
dfAll$sample <- factor(dfAll$sample)

## association of GC content with counts
plotGCHex(gr, rowMeans(counts)) +
  theme(axis.title = element_text(size=16)) +
  labs(fill="Nr. of peaks") + 
  geom_line(aes(x=x, y=y, group=sample, color=sample), data=dfAll, size=1) +
  scale_color_discrete()

## just the average GC content
p1 <- ggplot(dfAll, aes(x=x, y=y, group=sample, color=sample)) +
  geom_line(size = 1) +
  xlab("GC-content") +
  ylab("log(count + 1)") +
  theme_classic()

Peak width

lowListWidth <- list()
for(kk in 1:ncol(counts)){
  lowListWidth[[kk]] <- lowess(x=log(width(gr)), y=log1p(counts[,kk]), f=1/10)
}

plot(x=seq(min(log(width(gr))), max(log(width(gr))), length=10),
     y=seq(0, 5, length=10), type='n',
     xlab="Peak width", ylab="log(count + 1)")
for(kk in 1:length(lowListWidth)){
  oo <- order(lowListWidth[[kk]]$x)
  lines(x=lowListWidth[[kk]]$x[oo], y=lowListWidth[[kk]]$y[oo], col=colsBig[kk])
}

Mock comparisons

# Mock comparison in paper is 
## group A: 1001, 1003, 1008
## group B: 1002, 1004, 1010

# links are
## 1001: SRR7650766 
## 1002: SRR7650808
## 1003: SRR7650848
## 1004: SRR7650885
## 1008: SRR7650913
## 1010: SRR7650922

colnames(counts) <- unlist(lapply(strsplit(colnames(counts), split=".", fixed=TRUE), "[[", 1))

set.seed(33)
mock <- factor(sample(rep(letters[1:2], each=3)))
names(mock) <- colnames(counts)
mock # agrees 
## SRR7650766 SRR7650808 SRR7650848 SRR7650885 SRR7650913 SRR7650922 
##          a          b          a          b          a          b 
## Levels: a b
design <- model.matrix(~mock)

# filter
keepMemContr <- rowSums(cpm(counts) >= 2) >=3 
table(keepMemContr)
## keepMemContr
##  FALSE   TRUE 
## 272846  85415
countsMemControl <- counts[keepMemContr, ]
# equally sized bins after filtering
gcGroupsMem <- Hmisc::cut2(gcContentPeaks[keepMemContr], g=20)
gcGroupsMem10 <- Hmisc::cut2(gcContentPeaks[keepMemContr], g=10)
gcMem <- gcContentPeaks[keepMemContr]
widthGroupsMem <- Hmisc::cut2(width(gr)[keepMemContr], g=20)

No normalization

## TMM normalization
library(edgeR)
d <- DGEList(countsMemControl)
d <- estimateDisp(d, design)
fit <- glmFit(d, design)
lrt <- glmLRT(fit, coef=2) 
dfEdgeR <- data.frame(logFC=log(2^lrt$table$logFC),
                 gc=gcGroupsMem)
pNone <- ggplot(dfEdgeR, aes(x=gc, y=logFC, color=gc)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(gcGroups), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("No normalization") +
  xlab("GC-content bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pNone
## Warning: Removed 833 rows containing non-finite values (stat_ydensity).
## Warning: Removed 833 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 833 rows containing non-finite values (stat_smooth).

dfEdgeRWidth <- data.frame(logFC=log(2^lrt$table$logFC),
                 peakWidth=widthGroupsMem)

pNoneWidth <- ggplot(dfEdgeRWidth, aes(x=peakWidth, y=logFC, color=peakWidth)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(dfEdgeRWidth$peakWidth), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("No normalization") +
  xlab("Peak width bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pNoneWidth
## Warning: Removed 833 rows containing non-finite values (stat_ydensity).

## Warning: Removed 833 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 833 rows containing non-finite values (stat_smooth).

edgeR (TMM normalization)

## TMM normalization
library(edgeR)
d <- DGEList(countsMemControl)
d <- calcNormFactors(d)
d <- estimateDisp(d, design)
fit <- glmFit(d, design)
lrt <- glmLRT(fit, coef=2) 
dfEdgeR <- data.frame(logFC=log(2^lrt$table$logFC),
                 gc=gcGroupsMem)
pedgeR <- ggplot(dfEdgeR, aes(x=gc, y=logFC, color=gc)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(gcGroups), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("TMM normalization") +
  xlab("GC-content bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pedgeR
## Warning: Removed 721 rows containing non-finite values (stat_ydensity).
## Warning: Removed 721 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 721 rows containing non-finite values (stat_smooth).

dfEdgeRWidth <- data.frame(logFC=log(2^lrt$table$logFC),
                 peakWidth=widthGroupsMem)

pedgeRWidth <- ggplot(dfEdgeRWidth, aes(x=peakWidth, y=logFC, color=peakWidth)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(dfEdgeRWidth$peakWidth), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("TMM normalization") +
  xlab("Peak width bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pedgeRWidth
## Warning: Removed 721 rows containing non-finite values (stat_ydensity).

## Warning: Removed 721 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 721 rows containing non-finite values (stat_smooth).

DESeq2 (MOR normalization)

## DESeq2 normalization
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countsMemControl, 
                       colData=data.frame(mock=mock), 
                       design=~mock)
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, name="mock_b_vs_a")
dfDESeq2 <- data.frame(logFC=log(2^res$log2FoldChange),
                       gc=gcGroupsMem)
pdeseq <- ggplot(dfDESeq2) +
  aes(x=gc, y=logFC, color=gc) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(gcGroups), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("DESeq2 MOR normalization") +
  xlab("GC-content bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pdeseq
## Warning: Removed 825 rows containing non-finite values (stat_ydensity).
## Warning: Removed 825 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 825 rows containing non-finite values (stat_smooth).

dfDESeq2Width <- data.frame(logFC=log(2^res$log2FoldChange),
                 peakWidth=widthGroupsMem)

pdeseqWidth <- ggplot(dfDESeq2Width, aes(x=peakWidth, y=logFC, color=peakWidth)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(dfDESeq2Width$peakWidth), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("DESeq2 MOR normalization") +
  xlab("Peak width bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pdeseqWidth
## Warning: Removed 825 rows containing non-finite values (stat_ydensity).

## Warning: Removed 825 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 825 rows containing non-finite values (stat_smooth).

Full quantile

## Full quantile normalization
countsFQ <- FQnorm(countsMemControl, type="median")
d <- DGEList(countsFQ)
d <- estimateDisp(d, design)
fit <- glmFit(d, design)
lrtFQ <- glmLRT(fit, coef=2) 
dfFQ <- data.frame(logFC=log(2^lrtFQ$table$logFC),
                      gc=gcGroupsMem)
pFQ <- ggplot(dfFQ) +
  aes(x=gc, y=logFC, color=gc) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(gcGroups), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("FQ normalization") +
  xlab("GC-content bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pFQ
## Warning: Removed 1197 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1197 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1197 rows containing non-finite values (stat_smooth).

dfFQWidth <- data.frame(logFC=log(2^lrtFQ$table$logFC),
                 peakWidth=widthGroupsMem)

pFQWidth <- ggplot(dfFQWidth, aes(x=peakWidth, y=logFC, color=peakWidth)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(dfFQWidth$peakWidth), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  ggtitle("FQ normalization") +
  xlab("Peak width bin") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pFQWidth
## Warning: Removed 1197 rows containing non-finite values (stat_ydensity).

## Warning: Removed 1197 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1197 rows containing non-finite values (stat_smooth).

Composite plot for figure 1

p <- plot_grid(p1 + ggtitle("Memory NK cells"), pedgeR, 
               pdeseq, pFQ,
               labels=letters[1:4])
## Warning: Removed 721 rows containing non-finite values (stat_ydensity).
## Warning: Removed 721 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 721 rows containing non-finite values (stat_smooth).
## Warning: Removed 825 rows containing non-finite values (stat_ydensity).
## Warning: Removed 825 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 825 rows containing non-finite values (stat_smooth).
## Warning: Removed 1197 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1197 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1197 rows containing non-finite values (stat_smooth).
p

ggsave("~/Dropbox/research/atacseq/bulk/plots/figure1_gcapc.png",
       units="in", width=12, height=9)

pWidth <- plot_grid(pedgeRWidth, 
               pdeseqWidth, pFQWidth,
               labels=letters[1:3], 
               ncol=1)
## Warning: Removed 721 rows containing non-finite values (stat_ydensity).
## Warning: Removed 721 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 721 rows containing non-finite values (stat_smooth).
## Warning: Removed 825 rows containing non-finite values (stat_ydensity).
## Warning: Removed 825 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 825 rows containing non-finite values (stat_smooth).
## Warning: Removed 1197 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1197 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1197 rows containing non-finite values (stat_smooth).
pWidth

ggsave("~/Dropbox/research/atacseq/bulk/plots/figure1Width_gcapc.png",
       units="in", width=9, height=10)

Figure 2

##### FIGURE 2
## cqn
cqnModel <- cqn(countsMemControl, x=gcMem, sizeFactors = colSums(countsMemControl),
                lengths=width(gr)[keepMemContr])
## Warning: The use of 'sig2' is deprecated; do specify 'sigma' (= sqrt(sig2))
## instead
d <- DGEList(countsMemControl)
d$offset <- cqnModel$glm.offset
d <- estimateDisp(d, design)
fit <- glmFit(d, design)
lrtCqn <- glmLRT(fit, coef=2)
dfCqn <- data.frame(logFC=log(2^lrtCqn$table$logFC),
                   gc=gcGroupsMem)
pCqn <- ggplot(dfCqn) +
  aes(x=gc, y=logFC, color=gc) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(gcGroups), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  xlab("GC-content bin") +
  ggtitle("cqn normalization") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pCqn
## Warning: Removed 587 rows containing non-finite values (stat_ydensity).
## Warning: Removed 587 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 587 rows containing non-finite values (stat_smooth).

dfCqnWidth <- data.frame(logFC=log(2^lrtCqn$table$logFC),
                   peakWidth=widthGroupsMem)
pCqnWidth <- ggplot(dfCqnWidth, aes(x=peakWidth, y=logFC, color=peakWidth)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(dfCqnWidth$peakWidth), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  xlab("Peak width bin") +
  ggtitle("cqn normalization") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pCqnWidth
## Warning: Removed 587 rows containing non-finite values (stat_ydensity).

## Warning: Removed 587 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 587 rows containing non-finite values (stat_smooth).

# ## EDASeq
library(EDASeq)
## Loading required package: ShortRead
## 
## Attaching package: 'ShortRead'
## The following object is masked from 'package:dplyr':
## 
##     id
## The following object is masked from 'package:purrr':
## 
##     compose
## The following object is masked from 'package:tibble':
## 
##     view
#emptyRows <- which(rownames(countsMouse) == "")
#rownames(countsMouse)[emptyRows] <- paste0("emptyPeak",1:length(emptyRows))
dataWithin <- withinLaneNormalization(countsMemControl, y=gcMem,
                                      num.bins=20, which="full")
dataNorm <- betweenLaneNormalization(dataWithin, which="full")
d <- DGEList(dataNorm)
d <- estimateDisp(d, design)
## Warning: Zero sample variances detected, have been offset away from zero
fit <- glmFit(d, design)
lrtEDASeq <- glmLRT(fit, coef=2)
dfEDASeq <- data.frame(logFC=log(2^lrtEDASeq$table$logFC),
                    gc=gcGroupsMem)
pEDASeq <- ggplot(dfEDASeq) +
  aes(x=gc, y=logFC, color=gc) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(gcGroups), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() +
  ylim(c(-1,1)) +
  xlab("GC-content bin") +
  ggtitle("FQ-FQ normalization") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pEDASeq
## Warning: Removed 1000 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1000 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1000 rows containing non-finite values (stat_smooth).

dfEDASeqWidth <- data.frame(logFC=log(2^lrtEDASeq$table$logFC),
                   peakWidth=widthGroupsMem)
pEDASeqWidth <- ggplot(dfEDASeqWidth, aes(x=peakWidth, y=logFC, color=peakWidth)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(dfCqnWidth$peakWidth), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  xlab("Peak width bin") +
  ggtitle("FQ-FQ normalization") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pEDASeqWidth
## Warning: Removed 1000 rows containing non-finite values (stat_ydensity).

## Warning: Removed 1000 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1000 rows containing non-finite values (stat_smooth).

## GC-QN
countsGCQN <- gcqn(countsMemControl, gcGroupsMem, summary = "median")
d <- DGEList(countsGCQN)
d <- estimateDisp(d, design)
fit <- glmFit(d, design)
lrtGCQN <- glmLRT(fit, coef=2)
dfGCQN <- data.frame(logFC=log(2^lrtGCQN$table$logFC),
                   gc=gcGroupsMem)
pGCQN <- ggplot(dfGCQN) +
  aes(x=gc, y=logFC, color=gc) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(gcGroups), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  xlab("GC-content bin") +
  ggtitle("GC-FQ normalization") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pGCQN
## Warning: Removed 1184 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1184 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1184 rows containing non-finite values (stat_smooth).

dfGCQNWidth <- data.frame(logFC=log(2^lrtGCQN$table$logFC),
                   peakWidth=widthGroupsMem)
pGCQNWidth <- ggplot(dfGCQNWidth, aes(x=peakWidth, y=logFC, color=peakWidth)) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(dfCqnWidth$peakWidth), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() + 
  ylim(c(-1,1)) +
  xlab("Peak width bin") +
  ggtitle("GC-FQ normalization") +
  theme(axis.text.x = element_text(angle = 45, vjust = .5),
        legend.position = "none",
        axis.title = element_text(size=16)) +
  geom_smooth(se=FALSE, color="blue", aes(group=1), lwd=1)
pGCQNWidth
## Warning: Removed 1184 rows containing non-finite values (stat_ydensity).

## Warning: Removed 1184 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1184 rows containing non-finite values (stat_smooth).

## ridges plot before normalization
countsN <- countsMemControl[,order(colSums(countsMemControl), decreasing=TRUE)[1:3]]


lc <- log1p(c(countsN))
joyDat <- data.frame(lc=lc, 
                     gc=rep(gcGroupsMem10, 3),
                     sample=rep(1:3, each=nrow(countsN)))
axText <- 0
pRidge1 <- joyDat %>% ggplot(aes(y=gc)) + 
  geom_density_ridges(aes(x=lc)) + 
  facet_wrap(.~sample) +
  theme_ridges(grid=FALSE, font_size=5, center_axis_labels = TRUE) + 
  xlim(c(0.5,7)) +
  xlab("log(count + 1)") +
  ylab("GC-content bin") +
  theme(axis.text.y = element_text(size=axText),
        axis.text.x = element_text(size=10),
        legend.position = "none",
        axis.title = element_text(size=16), 
             strip.background = element_blank(),
             strip.text.x = element_blank())

pFC <- cowplot::plot_grid(pCqn, pEDASeq, pGCQN,
                          labels=letters[2:4],
                          nrow=3, ncol=1)
## Warning: Removed 587 rows containing non-finite values (stat_ydensity).
## Warning: Removed 587 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 587 rows containing non-finite values (stat_smooth).
## Warning: Removed 1000 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1000 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1000 rows containing non-finite values (stat_smooth).
## Warning: Removed 1184 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1184 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1184 rows containing non-finite values (stat_smooth).
pFig2 <- cowplot::plot_grid(pRidge1, 
                   pFC,
                   labels=c("a",""))
## Picking joint bandwidth of 0.144
## Picking joint bandwidth of 0.131
## Picking joint bandwidth of 0.131
## Warning: Removed 1628 rows containing non-finite values (stat_density_ridges).
pFig2

ggsave("~/Dropbox/research/atacseq/bulk/plots/figure2_gcapc.png",
        units="in", width=12, height=9)

pWidth2 <- cowplot::plot_grid(pCqnWidth, pEDASeqWidth, pGCQNWidth,
                          labels=letters[1:3],
                          nrow=3, ncol=1)
## Warning: Removed 587 rows containing non-finite values (stat_ydensity).
## Warning: Removed 587 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 587 rows containing non-finite values (stat_smooth).
## Warning: Removed 1000 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1000 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1000 rows containing non-finite values (stat_smooth).
## Warning: Removed 1184 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1184 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1184 rows containing non-finite values (stat_smooth).
pWidth2

ggsave("~/Dropbox/research/atacseq/bulk/plots/figure2Width_gcapc.png",
       units="in", width=9, height=10)